from pymysql import NULL
import getData
import numpy as np
import numpy as np
from utils.date_utils.date_function import is_weekday_commit\
    ,project_age,get_latency_after_response,get_waiting_time
from utils.num_utils.num_function import get_label_count\
    ,get_workload,get_prev_prs,get_change_num\
    ,get_accept_num,get_close_num,get_review_num\
    ,get_participants_count
from utils.str_utils.str_function import wordCount
from utils.num_utils.num_ratio_function import get_pr_author_rate\
    ,get_project_line_rate,get_line_weekday_rate,get_project_line_churn_rate\
        ,get_commits_average,get_avg_comments,get_avg_latency
import os
# Absolute path of the directory containing this script (not referenced below).
basedir = os.path.abspath(os.path.dirname(__file__))



# Fetch every PR row from the database.
raw_data = getData.getDataFromSql("select * from pr_self")
# Report how many PRs were fetched.
print(len(raw_data))

# Column positions in a raw pr_self row to keep, paired with their meaning.
_SELECTED_COLUMNS = [
    (0, 'pr_number'),
    (2, 'repo_name'),
    (3, 'pr_user_id'),
    (5, 'pr_author_association'),
    (8, 'labels'),
    (10, 'created_at'),
    (12, 'closed_at'),
    (13, 'merged_at'),
    (14, 'merged'),
    (16, 'mergeable_state'),
    (18, 'assignees_content'),
    (20, 'comments_number'),
    (21, 'comments_content'),
    (22, 'review_comments_number'),
    (23, 'review_comments_content'),
    (24, 'commit_number'),
    (26, 'changed_file_num'),
    (27, 'total_add_line'),
    (28, 'total_delete_line'),
    (4, 'pr_user_name'),
    (11, 'updated_at'),
    (6, 'title'),
    (7, 'body'),
]
useful_features_index = [idx for idx, _name in _SELECTED_COLUMNS]



# Keep only the useful columns of every raw row.
selected_data = [[row[i] for i in useful_features_index] for row in raw_data]



# Encode the selected rows: categorical string fields become small integer
# codes; everything else is carried through, plus a sequential pr_id.
_ASSOCIATION_CODES = {'CONTRIBUTOR': 0, 'MEMBER': 1, 'NONE': 2}  # anything else -> 3
_MERGEABLE_CODES = {'unknown': 0, 'blocked': 1, 'dirty': 2}      # anything else -> 3

process_data = []
for seq, item in enumerate(selected_data):
    row = [
        item[0],                               # pr_number
        item[1],                               # repo_name
        item[2],                               # pr_user_id
        _ASSOCIATION_CODES.get(item[3], 3),    # pr_author_association code
        0 if item[4] == '{}' else 1,           # has labels?
        item[5],                               # created_at
        item[6],                               # closed_at
        item[7],                               # merged_at
        item[8],                               # merged
        _MERGEABLE_CODES.get(item[9], 3),      # mergeable_state code
        0 if item[10] == '{}' else 1,          # has assignees?
        item[11],                              # comments_number
        item[12],                              # comments_content
        item[13],                              # review_comments_number
        item[14],                              # review_comments_content
        item[15],                              # commit_number
        item[16],                              # changed_file_num
        item[17],                              # total_add_line
        item[18],                              # total_delete_line
        item[19],                              # pr_user_name
        seq,                                   # pr_id (sequential, 0-based)
        item[20],                              # updated_at
        item[4],                               # raw labels content
        item[21],                              # title
        item[22],                              # body
    ]
    process_data.append(row)

count = len(process_data)
print(count)

# Named positions of each field within a process_data row, so the
# indexing below stays readable.
pr_number_index = 0
repo_name_index = 1
pr_user_id_index = 2
pr_author_association_index = 3
labels_index = 4
created_at_index = 5
closed_at_index = 6
merged_at_index = 7
merged_index = 8
mergeable_state_index = 9
assignees_content_index = 10
comments_number_index = 11
comments_content_index = 12
review_comments_number_index = 13
review_comments_content_index = 14
commit_number_index = 15
changed_file_num_index = 16
total_add_line_index = 17
total_delete_line_index = 18
pr_user_name_index = 19
pr_id_index = 20
updated_at_index = 21
content_index = 22
title_index = 23
body_index = 24



## Repository metadata (timestamps) used to derive repository age below.
repo_data=getData.getDataFromSql(
    "select repo_id,repo_name\
        ,project_created_at,project_updated_at\
        ,project_pushed_at from pr_repo"
)

# Column positions within each repo_data row.
repo_id_index=0
repo_data_name_index=1
project_created_at_index=2
project_updated_at_index=3
project_pushed_at_index=4


## Repository age per repo id, then aligned to each PR by repository name.
proj_age = {}
for repo in repo_data:
    proj_age[repo[repo_id_index]] = {
        'created_time': repo[project_created_at_index],
        'updated_time': repo[project_updated_at_index],
        'pushed_time': repo[project_pushed_at_index],
    }
proj_age = project_age(proj_age)

# Hoist the repo-name lookup out of the per-PR loop: the original scan was
# O(len(process_data) * len(repo_data)); this is linear. A name maps to a
# LIST of ages so duplicate repo names (if any) still contribute one entry
# each, in repo_data order, exactly as before.
_ages_by_repo_name = {}
for repo in repo_data:
    _ages_by_repo_name.setdefault(repo[repo_data_name_index], []).append(
        proj_age[repo[repo_id_index]])

Proj_age = []
for item in process_data:
    Proj_age.extend(_ages_by_repo_name.get(item[repo_name_index], []))





## Whether each PR was created on a weekday (keyed by pr_id).
is_weekday = is_weekday_commit({
    item[pr_id_index]: item[created_at_index] for item in process_data
})


### Label count per PR, parsed out of the raw labels content.
label_dict = get_label_count({
    item[pr_id_index]: item[content_index] for item in process_data
})

def _collect_pr_features(extractors):
    """Build {pr_id: {field_name: value}} over process_data.

    extractors maps a field name to a callable that pulls that field out of
    one process_data row. Insertion order of both dicts follows the input
    order, matching the original hand-built tuple lists.
    """
    return {
        row[pr_id_index]: {name: get(row) for name, get in extractors.items()}
        for row in process_data
    }


# Number of open PRs at the moment each PR was submitted.
workload = get_workload(_collect_pr_features({
    'created_time': lambda r: r[created_at_index],
    'closed_time': lambda r: r[closed_at_index],
}))

# Number of PRs the author submitted before the current one.
pre_prs = get_prev_prs(_collect_pr_features({
    'pr_user_name': lambda r: r[pr_user_name_index],
}))

# Total changed lines across the author's earlier PRs.
change_num = get_change_num(_collect_pr_features({
    'pr_user_name': lambda r: r[pr_user_name_index],
    'changed_line_num': lambda r: r[total_add_line_index] + r[total_delete_line_index],
}))

# How many of the author's earlier PRs were accepted (merged).
accept_num = get_accept_num(_collect_pr_features({
    'pr_user_name': lambda r: r[pr_user_name_index],
    'merged_at': lambda r: r[merged_at_index],
}))

# How many of the author's earlier PRs were closed.
close_num = get_close_num(_collect_pr_features({
    'pr_user_name': lambda r: r[pr_user_name_index],
    'created_time': lambda r: r[created_at_index],
    'closed_time': lambda r: r[closed_at_index],
    'merged_at': lambda r: r[merged_at_index],
}))

# How many PRs the author reviewed before (via review comments).
review_num = get_review_num(_collect_pr_features({
    'pr_user_name': lambda r: r[pr_user_name_index],
    'created_time': lambda r: r[created_at_index],
    'closed_time': lambda r: r[closed_at_index],
    'review_comments_number': lambda r: r[review_comments_number_index],
    'review_comments_content': lambda r: r[review_comments_content_index],
}))

# Total participants per PR: submitter + commenters + reviewers.
participants_count = get_participants_count(_collect_pr_features({
    'pr_user_name': lambda r: r[pr_user_name_index],
    'created_time': lambda r: r[created_at_index],
    'closed_time': lambda r: r[closed_at_index],
    'comments_number': lambda r: r[comments_number_index],
    'comments_content': lambda r: r[comments_content_index],
    'review_comments_number': lambda r: r[review_comments_number_index],
    'review_comments_content': lambda r: r[review_comments_content_index],
}))



# Word counts of each PR's title and body.
title_words = [wordCount(item[title_index]) for item in process_data]
body_words = [wordCount(item[body_index]) for item in process_data]


# Binary indicators: does the PR body mention each keyword?
key_words = ['bug', 'document', 'feature', 'improve', 'refactor']
has_key_words = []
for item in process_data:
    body = item[body_index]
    if body is None:
        # No body at all: every keyword flag is 0.
        has_key_words.append([0] * len(key_words))
    else:
        has_key_words.append([1 if k in body else 0 for k in key_words])


# Per-PR feature dict consumed by the author/project rate helpers below
# (acceptance rates, line churn, latency, etc.), keyed by pr_id.
pr_data = {
    item[pr_id_index]: {
        'pr_user_name': item[pr_user_name_index],
        'created_time': item[created_at_index],
        'closed_time': item[closed_at_index],
        'comments_number': item[comments_number_index],
        'comments_content': item[comments_content_index],
        'review_comments_number': item[review_comments_number_index],
        'review_comments_content': item[review_comments_content_index],
        'total_add_line': item[total_add_line_index],
        'total_delete_line': item[total_delete_line_index],
        'commit_number': item[commit_number_index],
        'merged_time': item[merged_at_index],
    }
    for item in process_data
}


# Author's overall acceptance rate, closed-PR rate, contribution rate to the
# repo, and the project's overall acceptance rate.
pr_author_rate=get_pr_author_rate(pr_data)
    #    {
    #          52949: {
    #          'self_accept_rate': 1.0,
    #          'self_closed_num_rate': 1.0,
    #          'self_contribution_rate': 5.17437648763324e-05,
    #          'project_accept_rate': 0.7065093656214426
    #          },
    #    }


# Project's average deleted / added / changed line counts over the previous week.
project_line_rate=get_project_line_rate(pr_data)
    #    {
    #          52949: {
    #          'deletions_per_week': 63469.40064102564,
    #          'additions_per_week': 194976.12820512822,
    #          'changes_per_week': 258445.52884615384
    #          },
    #    }


# Average modified / added / deleted line counts for PRs created on the same
# day of the week as this PR.
line_weekday_rate=get_line_weekday_rate(pr_data)
    #    {
    #         52949: {
    #             'per_lines_deleted_week_days': 799.7732574679943,
    #             'per_lines_added_week_days': 3677.2375533428167,
    #             'per_lines_changed_week_days': 4477.010810810811
    #         },
    #    }



# Average deleted / added / changed lines per PR (the unit is a PR here,
# not a week as above).
project_line_churn_rate=get_project_line_churn_rate(pr_data)
# print(project_line_churn_rate)
    #    {
    #      52949: {
    #      'deletions_per_pr': 1024.7502845907068,
    #      'additions_per_pr': 3147.855634895995,
    #      'changes_per_pr': 4172.605919486702
    #      },
    #    }


# Average commit count across all PRs as of each PR's creation time.
commits_average=get_commits_average(pr_data)
# print(commits_average)
    #     {
    #         52946: 21.698354377975573,
    #         pr_number:commits_average
    #    }


# As of each PR's creation time: average comment count over closed PRs and
# over merged PRs.
avg_comments=get_avg_comments(pr_data)
# print(avg_comments)

    #    {
    #         52948: {
    #         'comments_per_closed_pr': 5.77386725063872,
    #         'comments_per_merged_pr': 5.345905961622968
    #         },
    #    }

# Average number of days from a PR being opened to merged, and from opened
# to closed.
avg_latency=get_avg_latency(pr_data)
    #     {
    #      52948: {
    #      'close_latency': 19.34282287919078,
    #      'merge_latency': 12.824740002929545
    #      },
    #    }

X_dispersed = []   # discrete-valued features
X_successive = []  # continuous-valued features


# Discrete base features: author association code, has-labels flag,
# mergeable_state code, has-assignees flag.
# Continuous base features: comment / review-comment / commit counts,
# changed-file count, added and deleted line counts.
for item in process_data:
    X_dispersed.append([
        item[pr_author_association_index],
        item[labels_index],
        item[mergeable_state_index],
        item[assignees_content_index],
    ])
    X_successive.append([
        item[comments_number_index],
        item[review_comments_number_index],
        item[commit_number_index],
        item[changed_file_num_index],
        item[total_add_line_index],
        item[total_delete_line_index],
    ])

# Append the author-rate features and the weekday flag to the discrete rows.
for i, row in enumerate(X_dispersed):
    rates = pr_author_rate[i]
    row.extend([
        rates['self_accept_rate'],
        rates['self_closed_num_rate'],
        rates['self_contribution_rate'],
        rates['project_accept_rate'],
        is_weekday[i],
    ])


# Append the derived continuous features, in a fixed order.
for i, row in enumerate(X_successive):
    # Proj_age intentionally left out (mirrors the original commented-out line).
    row.extend([
        label_dict[i],
        workload[i],
        pre_prs[i],
        change_num[i],
        accept_num[i],
        close_num[i],
        review_num[i],
        participants_count[i],
        title_words[i],
        body_words[i],
    ])
    row.extend(has_key_words[i])

    weekly = project_line_rate[i]
    row.extend([weekly['deletions_per_week'],
                weekly['additions_per_week'],
                weekly['changes_per_week']])
    weekday = line_weekday_rate[i]
    row.extend([weekday['per_lines_deleted_week_days'],
                weekday['per_lines_added_week_days'],
                weekday['per_lines_changed_week_days']])
    churn = project_line_churn_rate[i]
    row.extend([churn['deletions_per_pr'],
                churn['additions_per_pr'],
                churn['changes_per_pr']])
    row.append(commits_average[i])
    comments = avg_comments[i]
    row.extend([comments['comments_per_closed_pr'],
                comments['comments_per_merged_pr']])
    latency = avg_latency[i]
    row.extend([latency['close_latency'],
                latency['merge_latency']])

### Min-max normalize each continuous feature column into [0, 1].
X_successive = np.array(X_successive)
mins = X_successive.min(0)   # per-column minimum
maxs = X_successive.max(0)   # per-column maximum
ranges = maxs - mins         # per-column spread
# BUGFIX: a constant column has range 0, so the original divided 0/0 and
# propagated NaN/inf into the CSV. Map zero ranges to 1 so such columns
# normalize to 0 instead.
ranges[ranges == 0] = 1
# Broadcasting applies the column-wise shift/scale directly; no need for
# the np.tile copies or the throwaway np.zeros allocation.
normData = (X_successive - mins) / ranges
X_successive = normData.tolist()

# Final feature matrix: discrete features followed by the normalized
# continuous features, one row per PR.
X = [list(d) + list(s) for d, s in zip(X_dispersed, X_successive)]



## First-response time of each PR, keyed by pr_id.
first_response_time = get_waiting_time({
    item[pr_id_index]: {
        'created_time': item[created_at_index],
        'updated_time': item[updated_at_index],
        'closed_time': item[closed_at_index],
        'comments_number': item[comments_number_index],
        'comments_content': item[comments_content_index],
        'review_comments_number': item[review_comments_number_index],
        'review_comments_content': item[review_comments_content_index],
        'pr_user_name': item[pr_user_name_index],
    }
    for item in process_data
})


## Response times in process_data order (dicts preserve insertion order).
Y1 = list(first_response_time.values())


#### Response-time labels.
Y_1 = list(Y1)
## Merged-or-not labels (about 70% merged).
Y_2 = [item[merged_index] for item in process_data]


## Final output labels per PR: [fast/slow response, merged or not].
Y = [[speed, merged] for speed, merged in zip(Y_1, Y_2)]


import csv
import torch
import pandas as pd


# CSV column names. BUGFIX: the order must match the row layout, which is
# Id + X_dispersed + X_successive + labels. X_dispersed ends with the four
# author-rate features and is_weekday, so those come BEFORE the comment /
# commit / line-count columns — the original header list had the two groups
# swapped, mislabeling 11 columns.
headers = (
    ['Id']
    # X_dispersed: base discrete features ...
    + ['author_identity',
       'has_labels',
       'mergable_state',
       'has_assignees_content']
    # ... then the appended author-rate features and weekday flag.
    + ['self_accept_rate',
       'self_closed_num_rate',
       'self_contribution_rate',
       'project_accept_rate',
       'is_weekday']
    # X_successive: base continuous features ...
    + ['comment_num',
       'review_comment_num',
       'commit_num',
       'file_changed_num',
       'total_add_line',
       'total_delete_line']
    # ... then the appended derived continuous features.
    + ['label_count',
       'workload',
       'pre_prs',
       'change_num',
       'accept_num',
       'close_num',
       'review_num',
       'participants_count',
       'title_words',
       'body_words',
       'has_bug',
       'has_document',
       'has_feature',
       'has_improve',
       'has_refactor',
       'deletions_per_week',
       'additions_per_week',
       'changes_per_week',
       'per_lines_deleted_week_days',
       'per_lines_added_week_days',
       'per_lines_changed_week_days',
       'deletions_per_pr',
       'additions_per_pr',
       'changes_per_pr',
       'commits_average',
       'comments_per_closed_pr',
       'comments_per_merged_pr',
       'close_latency',
       'merge_latency']
    # Output labels.
    + ['response_speed',
       'if_merged']
)
# One CSV row per PR: sequential id, all features, then the two labels.
row_data = []
for i, features in enumerate(X):
    row_data.append([i, *features, *Y[i]])
count = len(X)

# Write everything to the CSV file.
with open('./nodes_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, dialect='excel')
    writer.writerow(headers)
    writer.writerows(row_data)








